####################################
# Title - Event-Based Framing of Democracy in American News Media
# Date - May 28th, 2024
# Goal - Build a dictionary for each model of democracy
####################################

# Remove every object from the global environment before the analysis starts.
# NOTE(review): when this file is source()d from an existing session this wipes
# the caller's workspace; running the script in a fresh R session avoids the need.
rm(list=ls())

  # Library
  library(quanteda)
  library(dplyr)
  library(text2vec)
  library(conText)
  library(ggplot2)
  library(lubridate)
  library(viridis)
  library(tidyr)
  library(corrplot)

  # Input objects, all stored as RDS files under data/
  data <- readRDS("data/nyt_cleaned_articles.rds")
  # Embedding model is already trained by researchers at Stanford
  glove_vectors <- readRDS("data/glove.rds")
  # Transformation matrix passed as `transform_matrix` to the conText calls below
  transform_vectors <- readRDS("data/khodakA.rds")

#########################
#
# Preprocessing
#
#########################

  # Grouping variables used later as docvars: factor versions of month and year,
  # a numeric year, and a constant `global` column that puts every document in
  # one group.
  data <- data %>%
    mutate(pub_month_factor = as.factor(pub_month),
           pub_year_factor = as.factor(pub_year),
           pub_year = as.numeric(pub_year),
           global = 1)

  # Corpus keyed by `docid`, with the article text taken from `clean_text`
  text_corpus <- corpus(
    data,
    docid_field = "docid",
    text_field = "clean_text",
    meta = TRUE
  )

  # Tokenize the corpus, dropping punctuation, symbols, URLs and separators
  # (numbers are kept)
  toks <- tokens(
    text_corpus,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = FALSE,
    remove_url = TRUE,
    remove_separators = TRUE
  )

  # Remove English stopwords and any token shorter than 3 characters
  toks_nostop <- tokens_select(
    toks,
    pattern = stopwords("en"),
    selection = "remove",
    min_nchar = 3
  )

  # Keep only (lower-cased) features that occur at least 5 times in the corpus
  feats <- featnames(
    dfm_trim(dfm(toks_nostop, tolower = TRUE, verbose = FALSE), min_termfreq = 5)
  )

  # Optional spell check, run only when hunspell is installed.
  # Features are upper-cased first so that names are not flagged as misspelled.
  if (requireNamespace("hunspell", quietly = TRUE)) {
    library(hunspell) # spell check library
    spellcheck <- hunspell_check(
      toupper(feats),
      dict = hunspell::dictionary("en_US")
    )
    feats <- feats[spellcheck]
  }

  # Keep the retained features only; padding = TRUE leaves empty slots in place
  # of removed tokens so that non-adjacent words do not become adjacent
  toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE)


  # Build a tokenized corpus of the contexts (6 tokens either side) around the
  # target term "democracy". Matching is fixed and case-sensitive, so every
  # spelling variant is listed explicitly; no regex is used, so that words such
  # as "undemocracy" are not picked up.
  # NOTE(review): the upper-case possessive is listed as "DEMOCRACY's", not
  # "DEMOCRACY'S" -- confirm that this is intended.
  democracy_patterns <- c(
    "democracy", "Democracy", "DEMOCRACY",
    "democracies", "Democracies", "DEMOCRACIES",
    "democracy's", "Democracy's", "DEMOCRACY's",
    "democratic"
  )
  demo_toks <- tokens_context(
    x = toks_nostop_feats,
    pattern = democracy_patterns,
    window = 6L,
    valuetype = "fixed",
    case_insensitive = FALSE,
    hard_cut = FALSE,
    rm_keyword = FALSE,
    verbose = TRUE
  )

  # Intermediate objects are no longer needed; free them
  rm(text_corpus, toks, toks_nostop, feats)

  # Document-feature matrix of the contexts
  demo_dfm <- dfm(demo_toks)

  # Document-embedding matrix built from the pre-trained GloVe vectors
  # (300-dim) and the transformation matrix
  demo_dem <- dem(
    x = demo_dfm,
    pre_trained = glove_vectors,
    transform = TRUE,
    transform_matrix = transform_vectors,
    verbose = TRUE
  )

  # A single "corpus-wide" embedding: the column average of the
  # document-embedding matrix, stored as a one-row matrix named "democracy"
  demo_wv <- matrix(colMeans(demo_dem), ncol = ncol(demo_dem))
  rownames(demo_wv) <- "democracy"
  dim(demo_wv)
  
##############################
#
# 1. Cosine similarity for Electoral Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts
  feats <- featnames(dfm(demo_toks))

  # Electoral Democracy: seed terms whose nearest neighbours feed the dictionary
    target_word <- c("election", "suffrage", "vote", "majority", "voter")

    list_nns <- list()

    for (target_word_i in target_word) {

      # Contexts (6 tokens either side) around the seed term; the seed term
      # itself is removed from its own context
      context_tock <- tokens_context(x = toks_nostop_feats, pattern = target_word_i, window = 6L,
                                     valuetype = "glob", case_insensitive = TRUE, hard_cut = FALSE,
                                     rm_keyword = TRUE, verbose = TRUE)

      # One constant group, so the neighbours are computed over all contexts
      docvars(context_tock)$global <- 1
      set.seed(2021L) # same seed for every seed term
      word_nns <- get_nns(x = context_tock,
                          N = 5,
                          groups = docvars(context_tock, 'global'),
                          candidates = feats,
                          pre_trained = glove_vectors,
                          transform = TRUE,
                          transform_matrix = transform_vectors,
                          bootstrap = TRUE,
                          num_bootstraps = 100,
                          confidence_level = 0.95,
                          stem = FALSE,
                          as_list = FALSE)

      list_nns[[target_word_i]] <- word_nns
    }

    # Stack the per-seed results in one step, labelling each row with its seed
    # term (replaces growing a data frame with rbind() from an all-NA dummy row)
    result_df <- do.call(rbind, lapply(target_word, function(seed) {
      nns_seed <- list_nns[[seed]]
      nns_seed$target <- seed
      nns_seed
    }))

    # Read the key words and sort out relevant ones.
    # as.character() keeps c() below safe even if `feature` is a factor.
    words_candidates <- unique(as.character(result_df$feature))
    words_candidates
    # Neighbours judged irrelevant on manual inspection
    words_candidates <- words_candidates[!(words_candidates %in% c("championing", "self-determination",
                                                                   "gun-control", "tyrannize"))]
    # Final list: retained neighbours + seed terms + one manually added term
    words_candidates <- unique(c(words_candidates, target_word, "multiparty"))
    
  #Connections to Democracy
    # Run get_cos_sim() on the democracy contexts for the candidate words,
    # grouped by the docvar named `group_var`. The seed is reset before every
    # call so each grouping uses the same bootstrap draws as before.
    cos_sim_by <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,  # no stemming: keywords are used as written
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by("global")
    target_cos_year <- cos_sim_by("pub_year_factor")
    target_cos_month <- cos_sim_by("pub_month_factor")

    # Month labels become dates so the monthly plot gets a continuous x axis
    target_cos_month$target <- ymd(target_cos_month$target)

    print(target_cos_global %>% arrange(desc(value)), n = 50)
    mean(target_cos_global$value)

    # One panel per candidate word: cosine similarity over `target`, with a red
    # reference line at zero. `group = 1` is required because the yearly
    # `target` is a discrete label; without it each point is its own group and
    # geom_line() draws nothing.
    plot_cos_trend <- function(cos_df) {
      ggplot(cos_df) +
        geom_line(aes(x = target, y = value, group = 1)) +
        geom_hline(yintercept = 0, color = "red") +
        xlab("") +
        ylab("") +
        scale_color_viridis(discrete = TRUE, option = "A") +
        scale_fill_viridis(discrete = TRUE) +
        facet_wrap(~feature) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    }

    year_plot <- plot_cos_trend(target_cos_year)
    year_plot
    # Pass the plot explicitly instead of relying on last_plot()
    ggsave(filename = "fig/dictionary_electoral2_year.jpeg", plot = year_plot, width = 8, dpi = 1000)

    month_plot <- plot_cos_trend(target_cos_month)
    month_plot
    #ggsave(filename = "fig/dictionary_electoral2_month.jpeg", plot = month_plot, width = 8, dpi = 1000)

    # Average similarity per candidate word across years / months
    print(target_cos_year %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean)), n = 50)
    print(target_cos_month %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean)), n = 50)
    
  #Check the real usages
    # Nearest contexts to the corpus-wide "democracy" embedding, once with the
    # context tokens and once with contexts = NULL (kept below as `text_num`).
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm, or derive it from the data.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Join the two results on rank so each context text carries its identifier
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    words_candidates <- as.data.frame(words_candidates)

    # For every candidate word, the rows of list_ncs whose context matches it
    # (case-insensitive regex, so substrings match too). Words without a match
    # contribute zero rows.
    result <- do.call(rbind, lapply(words_candidates$words_candidates, function(word) {
      hits <- grep(pattern = word, list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(word, length(hits)), example = hits)
    }))
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the original text of each context. The column is addressed by its
    # full name `text_num` (the old `$text` only worked via partial matching),
    # and the lookup is vectorised so an empty result no longer fails.
    # Assumes text_num looks like "text<k>" and that k indexes the rows of
    # demo_dem's docvars -- TODO confirm.
    context_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                     list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[context_index]

    # Three highest-similarity examples per candidate word
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_electoral2_democracy.rds")
    
    #correlation plot
    # Pairwise correlation-test p-values for the columns of a matrix.
    # mat : a matrix (or anything as.matrix() accepts) with one variable per column
    # ... : further arguments passed on to stats::cor.test
    # Returns a symmetric ncol(mat) x ncol(mat) matrix of p-values with zeros on
    # the diagonal and the column names of `mat` as row and column names.
      cor.mtest <- function(mat, ...) {
        mat <- as.matrix(mat)
        n <- ncol(mat)
        p.mat <- matrix(NA, n, n)
        diag(p.mat) <- 0
        # seq_len() yields an empty loop for a single column, where 1:(n - 1)
        # would iterate over 1, 0 and index past the last column
        for (i in seq_len(max(n - 1L, 0L))) {
          for (j in seq(i + 1L, n)) {
            tmp <- cor.test(mat[, i], mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
          }
        }
        colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
        p.mat
      }
    
      # Wide table: one row per month, one column per candidate word
      corr <- pivot_wider(select(target_cos_month, target, feature, value),
                          names_from = feature,
                          values_from = value)
      row.names(corr) <- corr$target
      corr <- corr[, -1] # drop the month column, keep the word columns

      # Correlations between the monthly series of the candidate words
      M <- cor(corr)

      # Matching matrix of correlation-test p-values
      p.mat <- cor.mtest(corr)

      # Red-white-blue palette
      col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
      corrplot(
        M,
        method = "color",
        col = col(200),
        type = "upper",
        order = "hclust",
        addCoef.col = "black",  # print the correlation coefficient in each cell
        tl.col = "black",       # text label colour
        tl.srt = 45,            # text label rotation
        p.mat = p.mat,          # blank out cells that are not significant at 0.01
        sig.level = 0.01,
        insig = "blank",
        diag = FALSE            # hide the principal diagonal
      )
      #save with 1600 width
    
##############################
#
# 2. Cosine similarity for Liberal Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts
  feats <- featnames(dfm(demo_toks))

  # Liberal Democracy: seed terms whose nearest neighbours feed the dictionary
    target_word <- c("freedom", "rights", "constitution", "liberty", "law")  # checks and balances is not found

    list_nns <- list()

    for (target_word_i in target_word) {

      # Contexts (6 tokens either side) around the seed term; the seed term
      # itself is removed from its own context
      context_tock <- tokens_context(x = toks_nostop_feats, pattern = target_word_i, window = 6L,
                                     valuetype = "glob", case_insensitive = TRUE, hard_cut = FALSE,
                                     rm_keyword = TRUE, verbose = TRUE)

      # One constant group, so the neighbours are computed over all contexts
      docvars(context_tock)$global <- 1
      set.seed(2021L) # same seed for every seed term
      word_nns <- get_nns(x = context_tock,
                          N = 5,
                          groups = docvars(context_tock, 'global'),
                          candidates = feats,
                          pre_trained = glove_vectors,
                          transform = TRUE,
                          transform_matrix = transform_vectors,
                          bootstrap = TRUE,
                          num_bootstraps = 100,
                          confidence_level = 0.95,
                          stem = FALSE,
                          as_list = FALSE)

      list_nns[[target_word_i]] <- word_nns
    }

    # Stack the per-seed results in one step, labelling each row with its seed
    # term (replaces growing a data frame with rbind() from an all-NA dummy row)
    result_df <- do.call(rbind, lapply(target_word, function(seed) {
      nns_seed <- list_nns[[seed]]
      nns_seed$target <- seed
      nns_seed
    }))

    # Read the key words and sort out relevant ones.
    # as.character() keeps c() below safe even if `feature` is a factor.
    words_candidates <- unique(as.character(result_df$feature))
    words_candidates
    # Neighbours judged irrelevant on manual inspection
    words_candidates <- words_candidates[!(words_candidates %in% c("amending", "protections"))]
    # Final list: retained neighbours + seed terms + one manually added term
    words_candidates <- unique(c(words_candidates, target_word, "constitutionalism"))
    
    #Connections to Democracy
    # Run get_cos_sim() on the democracy contexts for the candidate words,
    # grouped by the docvar named `group_var`. The seed is reset before every
    # call so each grouping uses the same bootstrap draws as before.
    cos_sim_by <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,  # no stemming: keywords are used as written
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by("global")
    target_cos_year <- cos_sim_by("pub_year_factor")
    target_cos_month <- cos_sim_by("pub_month_factor")

    # Month labels become dates so the monthly plot gets a continuous x axis
    target_cos_month$target <- ymd(target_cos_month$target)

    print(target_cos_global %>% arrange(desc(value)), n = 50)
    mean(target_cos_global$value)

    # One panel per candidate word: cosine similarity over `target`, with a red
    # reference line at zero. `group = 1` is required because the yearly
    # `target` is a discrete label; without it each point is its own group and
    # geom_line() draws nothing.
    plot_cos_trend <- function(cos_df) {
      ggplot(cos_df) +
        geom_line(aes(x = target, y = value, group = 1)) +
        geom_hline(yintercept = 0, color = "red") +
        xlab("") +
        ylab("") +
        scale_color_viridis(discrete = TRUE, option = "A") +
        scale_fill_viridis(discrete = TRUE) +
        facet_wrap(~feature) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    }

    year_plot <- plot_cos_trend(target_cos_year)
    year_plot
    # Pass the plot explicitly instead of relying on last_plot()
    ggsave(filename = "fig/dictionary_liberal2_year.jpeg", plot = year_plot, width = 8, dpi = 1000)

    month_plot <- plot_cos_trend(target_cos_month)
    month_plot
    ggsave(filename = "fig/dictionary_liberal2_month.jpeg", plot = month_plot, width = 8, dpi = 1000)

    # Average similarity per candidate word across years / months
    target_cos_year %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean))
    target_cos_month %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean))
    
  #Check the real usages
    # Nearest contexts to the corpus-wide "democracy" embedding, once with the
    # context tokens and once with contexts = NULL (kept below as `text_num`).
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm, or derive it from the data.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Join the two results on rank so each context text carries its identifier
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    words_candidates <- as.data.frame(words_candidates)

    # For every candidate word, the rows of list_ncs whose context matches it
    # (case-insensitive regex, so substrings match too). Words without a match
    # contribute zero rows.
    result <- do.call(rbind, lapply(words_candidates$words_candidates, function(word) {
      hits <- grep(pattern = word, list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(word, length(hits)), example = hits)
    }))
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the original text of each context. The column is addressed by its
    # full name `text_num` (the old `$text` only worked via partial matching),
    # and the lookup is vectorised so an empty result no longer fails.
    # Assumes text_num looks like "text<k>" and that k indexes the rows of
    # demo_dem's docvars -- TODO confirm.
    context_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                     list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[context_index]

    # Three highest-similarity examples per candidate word
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_liberal2_democracy.rds")
    
    # Correlation plot of the monthly similarity series
    # Wide table: one row per month, one column per candidate word
    corr <- pivot_wider(select(target_cos_month, target, feature, value),
                        names_from = feature,
                        values_from = value)
    row.names(corr) <- corr$target
    corr <- corr[, -1] # drop the month column, keep the word columns

    # Correlations between the monthly series of the candidate words
    M <- cor(corr)

    # Matching matrix of correlation-test p-values
    p.mat <- cor.mtest(corr)

    # Red-white-blue palette
    col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
    corrplot(
      M,
      method = "color",
      col = col(200),
      type = "upper",
      order = "hclust",
      addCoef.col = "black",  # print the correlation coefficient in each cell
      tl.col = "black",       # text label colour
      tl.srt = 45,            # text label rotation
      p.mat = p.mat,          # blank out cells that are not significant at 0.01
      sig.level = 0.01,
      insig = "blank",
      diag = FALSE            # hide the principal diagonal
    )
    #save with 1600 width

##############################
#
# 3. Cosine similarity for Participatory Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts
  feats <- featnames(dfm(demo_toks))

  # Participatory Democracy: seed terms whose nearest neighbours feed the dictionary
    target_word <- c("participation", "grassroots", "protest", "referendum", "movement", "non-governmental")

    list_nns <- list()

    for (target_word_i in target_word) {

      # Contexts (6 tokens either side) around the seed term; the seed term
      # itself is removed from its own context
      context_tock <- tokens_context(x = toks_nostop_feats, pattern = target_word_i, window = 6L,
                                     valuetype = "glob", case_insensitive = TRUE, hard_cut = FALSE,
                                     rm_keyword = TRUE, verbose = TRUE)

      # One constant group, so the neighbours are computed over all contexts
      docvars(context_tock)$global <- 1
      set.seed(2021L) # same seed for every seed term
      word_nns <- get_nns(x = context_tock,
                          N = 5,
                          groups = docvars(context_tock, 'global'),
                          candidates = feats,
                          pre_trained = glove_vectors,
                          transform = TRUE,
                          transform_matrix = transform_vectors,
                          bootstrap = TRUE,
                          num_bootstraps = 100,
                          confidence_level = 0.95,
                          stem = FALSE,
                          as_list = FALSE)

      list_nns[[target_word_i]] <- word_nns
    }

    # Stack the per-seed results in one step, labelling each row with its seed
    # term (replaces growing a data frame with rbind() from an all-NA dummy row)
    result_df <- do.call(rbind, lapply(target_word, function(seed) {
      nns_seed <- list_nns[[seed]]
      nns_seed$target <- seed
      nns_seed
    }))

    # Read the key words and sort out relevant ones.
    # as.character() keeps c() below safe even if `feature` is a factor.
    words_candidates <- unique(as.character(result_df$feature))
    words_candidates
    # Neighbours judged irrelevant on manual inspection
    words_candidates <- words_candidates[!(words_candidates %in% c("advocacy", "societal", "disobedience", "fostering",
                                                                   "ballot", "elections", "championing", "gun-control",
                                                                   "espousing", "governmental", "organizations"))]
    # Final list: retained neighbours + seed terms
    words_candidates <- unique(c(words_candidates, target_word))
    
    #Connections to Democracy
    # Run get_cos_sim() on the democracy contexts for the candidate words,
    # grouped by the docvar named `group_var`. The seed is reset before every
    # call so each grouping uses the same bootstrap draws as before.
    cos_sim_by <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,  # no stemming: keywords are used as written
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by("global")
    target_cos_year <- cos_sim_by("pub_year_factor")
    target_cos_month <- cos_sim_by("pub_month_factor")

    # Month labels become dates so the monthly plot gets a continuous x axis
    target_cos_month$target <- ymd(target_cos_month$target)

    print(target_cos_global %>% arrange(desc(value)), n = 50)
    mean(target_cos_global$value)

    # One panel per candidate word: cosine similarity over `target`, with a red
    # reference line at zero. `group = 1` is required because the yearly
    # `target` is a discrete label; without it each point is its own group and
    # geom_line() draws nothing.
    plot_cos_trend <- function(cos_df) {
      ggplot(cos_df) +
        geom_line(aes(x = target, y = value, group = 1)) +
        geom_hline(yintercept = 0, color = "red") +
        xlab("") +
        ylab("") +
        scale_color_viridis(discrete = TRUE, option = "A") +
        scale_fill_viridis(discrete = TRUE) +
        facet_wrap(~feature) +
        theme_bw() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1))
    }

    year_plot <- plot_cos_trend(target_cos_year)
    year_plot
    # Pass the plot explicitly instead of relying on last_plot()
    ggsave(filename = "fig/dictionary_participatory2_year.jpeg", plot = year_plot, width = 8, dpi = 1000)

    month_plot <- plot_cos_trend(target_cos_month)
    month_plot
    ggsave(filename = "fig/dictionary_participatory2_month.jpeg", plot = month_plot, width = 8, dpi = 1000)

    # Average similarity per candidate word across years / months
    target_cos_year %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean))
    target_cos_month %>% group_by(feature) %>% summarise(mean = mean(value)) %>% arrange(desc(mean))
    
  #Check the real usages
    # Nearest contexts to the corpus-wide "democracy" embedding, once with the
    # context tokens and once with contexts = NULL (kept below as `text_num`).
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm, or derive it from the data.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Join the two results on rank so each context text carries its identifier
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    words_candidates <- as.data.frame(words_candidates)

    # For every candidate word, the rows of list_ncs whose context matches it
    # (case-insensitive regex, so substrings match too). Words without a match
    # contribute zero rows.
    result <- do.call(rbind, lapply(words_candidates$words_candidates, function(word) {
      hits <- grep(pattern = word, list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(word, length(hits)), example = hits)
    }))
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the original text of each context. The column is addressed by its
    # full name `text_num` (the old `$text` only worked via partial matching),
    # and the lookup is vectorised so an empty result no longer fails.
    # Assumes text_num looks like "text<k>" and that k indexes the rows of
    # demo_dem's docvars -- TODO confirm.
    context_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                     list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[context_index]

    # Three highest-similarity examples per candidate word
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_participatory2_democracy.rds")
    
  # Correlation plot of the monthly similarity series
    # Wide table: one row per month, one column per candidate word
    corr <- pivot_wider(select(target_cos_month, target, feature, value),
                        names_from = feature,
                        values_from = value)
    row.names(corr) <- corr$target
    corr <- corr[, -1] # drop the month column, keep the word columns

    # Correlations between the monthly series of the candidate words
    M <- cor(corr)

    # Matching matrix of correlation-test p-values
    p.mat <- cor.mtest(corr)

    # Red-white-blue palette
    col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
    corrplot(
      M,
      method = "color",
      col = col(200),
      type = "upper",
      order = "hclust",
      addCoef.col = "black",  # print the correlation coefficient in each cell
      tl.col = "black",       # text label colour
      tl.srt = 45,            # text label rotation
      p.mat = p.mat,          # blank out cells that are not significant at 0.01
      sig.level = 0.01,
      insig = "blank",
      diag = FALSE            # hide the principal diagonal
    )
    #save with 1600 width
    
##############################
#
# 4. Cosine similarity for Deliberative Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts
  feats <- featnames(dfm(demo_toks))

  # Deliberative Democracy: seed terms whose nearest neighbours feed the dictionary
    target_word <- c("dialogue", "deliberative", "deliberation", "reasoning", "consensus")  # townhall is not found

    list_nns <- list()

    for (target_word_i in target_word) {

      # Contexts (6 tokens either side) around the seed term; the seed term
      # itself is removed from its own context
      context_tock <- tokens_context(x = toks_nostop_feats, pattern = target_word_i, window = 6L,
                                     valuetype = "glob", case_insensitive = TRUE, hard_cut = FALSE,
                                     rm_keyword = TRUE, verbose = TRUE)

      # One constant group, so the neighbours are computed over all contexts
      docvars(context_tock)$global <- 1
      set.seed(2021L) # same seed for every seed term
      word_nns <- get_nns(x = context_tock,
                          N = 5,
                          groups = docvars(context_tock, 'global'),
                          candidates = feats,
                          pre_trained = glove_vectors,
                          transform = TRUE,
                          transform_matrix = transform_vectors,
                          bootstrap = TRUE,
                          num_bootstraps = 100,
                          confidence_level = 0.95,
                          stem = FALSE,
                          as_list = FALSE)

      list_nns[[target_word_i]] <- word_nns
    }

    # Stack the per-seed results in one step, labelling each row with its seed
    # term (replaces growing a data frame with rbind() from an all-NA dummy row)
    result_df <- do.call(rbind, lapply(target_word, function(seed) {
      nns_seed <- list_nns[[seed]]
      nns_seed$target <- seed
      nns_seed
    }))

    # Read the key words and sort out relevant ones.
    # as.character() keeps c() below safe even if `feature` is a factor.
    words_candidates <- unique(as.character(result_df$feature))
    words_candidates
    # Neighbours judged irrelevant on manual inspection
    words_candidates <- words_candidates[!(words_candidates %in% c("fostering", "substantive", "decision-making", "common-sense",
                                                                   "participatory", "intuitions", "worldviews", "societal", "agendas",
                                                                   "viewpoints"))]
    # Final list: retained neighbours + seed terms
    words_candidates <- unique(c(words_candidates, target_word))
    
    # Connections to Democracy
    # Cosine similarity between the democracy contexts and every candidate term,
    # computed for one grouping docvar at a time. The seed is reset before each
    # call so the three bootstraps start from the same RNG state.
    # NOTE(review): an earlier comment said the terms would be stemmed, but
    # stem = FALSE is what is passed to get_cos_sim() -- confirm intent.
    cos_sim_by_group <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by_group("global")
    target_cos_year <- cos_sim_by_group("pub_year_factor")
    target_cos_month <- cos_sim_by_group("pub_month_factor")

    # Month labels are parsed into Date values.
    target_cos_month[["target"]] <- ymd(target_cos_month[["target"]])

    # Whole-corpus similarities, strongest first, and their average.
    print(arrange(target_cos_global, desc(value)), n = 50)
    mean(target_cos_global[["value"]])
    
    # Yearly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero. The figure is written to fig/.
    # NOTE(review): unlike the monthly table, `target` is not converted to a
    # date/numeric type here, so ggplot presumably treats it as discrete. Without
    # an explicit group every year is its own one-point group and geom_line()
    # draws nothing; `group = 1` joins the years within each panel.
    target_cos_year %>% 
      ggplot() +
      geom_line(aes(x = target, y = value, group = 1))+
      geom_hline(yintercept = 0, color = "red")+
      xlab("") + 
      ylab("") +
      scale_color_viridis(discrete = TRUE, option = "A")+
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature)+
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    # ggsave() without `plot` writes the most recently created ggplot.
    ggsave(filename = "fig/dictionary_deliberative2_year.jpeg", width = 8, dpi = 1000)
    
    # Monthly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero; saved to fig/ via the last-plot default.
    ggplot(data = target_cos_month) +
      geom_line(mapping = aes(x = target, y = value)) +
      geom_hline(yintercept = 0, color = "red") +
      labs(x = "", y = "") +
      scale_color_viridis(discrete = TRUE, option = "A") +
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggsave(filename = "fig/dictionary_deliberative2_month.jpeg", width = 8, dpi = 1000)

    # Average similarity per term across years, strongest first.
    target_cos_year %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    # Average similarity per term across months, strongest first.
    target_cos_month %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    
  # Check the real usages
    # Rank the democracy contexts by similarity to the target embedding twice:
    # once returning the context tokens, once (contexts = NULL) returning an
    # identifier for each context -- presumably its document name, since a
    # "text" prefix is stripped from it below.
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Attach the identifier (as `text_num`) to each ranked context.
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    # Kept as a one-column data frame, as before, in case later code relies on it.
    words_candidates <- as.data.frame(words_candidates)

    # For every candidate term, the row indices of the contexts that mention it
    # (case-insensitive regex match). Built in one pass instead of growing a
    # data frame with rbind() inside a loop.
    term_hits <- lapply(words_candidates$words_candidates, function(term) {
      hit_rows <- grep(pattern = term, x = list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(term, length(hit_rows)), example = hit_rows)
    })
    result <- do.call(rbind, c(list(data.frame(words = character(0), example = integer(0))),
                               term_hits))

    # One row per (term, matching context) pair.
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the full article text for every context. The column is named
    # `text_num`; the previous `list_ncs$text` only worked through `$` partial
    # matching. Vectorised, so a table with zero rows is handled as well.
    # NOTE(review): assumes the number left after stripping "text" is the row
    # position of the document in demo_dem@docvars -- confirm.
    doc_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                 x = list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[doc_index]

    # Keep the three most similar contexts per term and store them for reading.
    # NOTE(review): "delibrative" in the file name looks like a typo; it is left
    # unchanged because other scripts may read this path.
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_delibrative2_democracy.rds")
    
    # correlation plot
    # Reshape the monthly similarities to wide form: one row per month, one
    # column per candidate term.
    corr_wide <- target_cos_month %>%
      select(target, feature, value) %>%
      pivot_wider(names_from = feature, values_from = value)
    # Use a plain data frame for the month labels: setting row names on a
    # tibble is deprecated and raises a warning.
    corr <- as.data.frame(corr_wide[, -1])
    row.names(corr) <- as.character(corr_wide$target)
    M <- cor(corr)
    
    # matrix of the p-value of the correlation
    # corrplot::cor.mtest() returns a list (p, lowCI, uppCI), whereas corrplot()
    # needs the p-value matrix itself. A matrix result (e.g. from a custom
    # cor.mtest defined elsewhere in this file) is passed through unchanged.
    p_test <- cor.mtest(corr)
    p.mat <- if (is.list(p_test)) p_test$p else p_test
    
    col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
    corrplot(M, method="color", col=col(200),  
             type="upper", order="hclust", 
             addCoef.col = "black", # Add coefficient of correlation
             tl.col="black", tl.srt=45, #Text label color and rotation
             # Combine with significance
             p.mat = p.mat, sig.level = 0.01, insig = "blank", 
             # hide correlation coefficient on the principal diagonal
             diag=FALSE 
    )
    #save with 1600 width

##############################
#
# 5. Cosine similarity for Egalitarian Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts.
  feats <- featnames(dfm(demo_toks))

  # Seed terms for the egalitarian model
    # NOTE(review): "social class" is a two-word pattern -- confirm that
    # tokens_context() matches it as intended.
    target_word <- c("egalitarian", "egalitarianism", "equal", "welfare", "social class", "inequality")  # equality is not found

    # For each seed term: collect the 6-token windows around it (keyword
    # removed), treat them as a single group, and fetch the 5 nearest-neighbour
    # features with bootstrapped uncertainty. The seed is reset before every
    # call. Results are stored in a list named by seed term.
    list_nns <- lapply(setNames(target_word, target_word), function(keyword) {
      keyword_contexts <- tokens_context(x = toks_nostop_feats, pattern = keyword, window = 6L,
                                         valuetype = "glob", case_insensitive = TRUE,
                                         hard_cut = FALSE, rm_keyword = TRUE,
                                         verbose = TRUE)
      docvars(keyword_contexts)$global <- 1
      set.seed(2021L)
      get_nns(x = keyword_contexts,
              N = 5,
              groups = docvars(keyword_contexts, 'global'),
              candidates = feats,
              pre_trained = glove_vectors,
              transform = TRUE,
              transform_matrix = transform_vectors,
              bootstrap = TRUE,
              num_bootstraps = 100,
              confidence_level = 0.95,
              stem = FALSE,
              as_list = FALSE)
    })

    # All-NA placeholder row that fixes the column layout of the stacked table.
    nns_template <- data.frame(target = NA,
                               feature = NA,
                               rank = NA,
                               value = NA,
                               std.error = NA,
                               lower.ci = NA,
                               upper.ci = NA)

    # Label each table with its seed term, stack them, then drop the placeholder.
    tagged_nns <- lapply(target_word, function(keyword) {
      nns_tbl <- list_nns[[keyword]]
      nns_tbl$target <- keyword
      nns_tbl
    })
    result_df <- do.call(rbind, c(list(nns_template), tagged_nns))
    result_df <- result_df[-1, ]
    
    # Pool the neighbour terms found for all target words and print them so the
    # list can be read and pruned by hand.
    words_candidates <- unique(result_df[["feature"]])
    words_candidates
    # Neighbours removed from the dictionary after reading the printed list.
    dropped_terms <- c("individualistic","pluralistic", "ideals", "enlightened", 
                       "individualism",  "liberalism",  "guaranteeing",
                       "bureaucracies","reforming", "backgrounds")
    words_candidates <- words_candidates[!words_candidates %in% dropped_terms]
    # Add the seed terms back, plus "socialism", which is included by hand.
    words_candidates <- unique(c(words_candidates, target_word, "socialism"))
    
    # Connections to Democracy
    # Cosine similarity between the democracy contexts and every candidate term,
    # computed for one grouping docvar at a time. The seed is reset before each
    # call so the three bootstraps start from the same RNG state.
    # NOTE(review): an earlier comment said the terms would be stemmed, but
    # stem = FALSE is what is passed to get_cos_sim() -- confirm intent.
    cos_sim_by_group <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by_group("global")
    target_cos_year <- cos_sim_by_group("pub_year_factor")
    target_cos_month <- cos_sim_by_group("pub_month_factor")

    # Month labels are parsed into Date values.
    target_cos_month[["target"]] <- ymd(target_cos_month[["target"]])

    # Whole-corpus similarities, strongest first, and their average.
    print(arrange(target_cos_global, desc(value)), n = 50)
    mean(target_cos_global[["value"]])
    
    # Yearly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero. The figure is written to fig/.
    # NOTE(review): unlike the monthly table, `target` is not converted to a
    # date/numeric type here, so ggplot presumably treats it as discrete. Without
    # an explicit group every year is its own one-point group and geom_line()
    # draws nothing; `group = 1` joins the years within each panel.
    target_cos_year %>% 
      ggplot() +
      geom_line(aes(x = target, y = value, group = 1))+
      geom_hline(yintercept = 0, color = "red")+
      xlab("") + 
      ylab("") +
      scale_color_viridis(discrete = TRUE, option = "A")+
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature)+
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    # ggsave() without `plot` writes the most recently created ggplot.
    ggsave(filename = "fig/dictionary_egalitarian2_year.jpeg", width = 8, dpi = 1000)
    
    # Monthly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero; saved to fig/ via the last-plot default.
    ggplot(data = target_cos_month) +
      geom_line(mapping = aes(x = target, y = value)) +
      geom_hline(yintercept = 0, color = "red") +
      labs(x = "", y = "") +
      scale_color_viridis(discrete = TRUE, option = "A") +
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggsave(filename = "fig/dictionary_egalitarian2_month.jpeg", width = 8, dpi = 1000)

    # Average similarity per term across years, strongest first.
    target_cos_year %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    # Same per-term averages across months, printed with up to 50 rows.
    monthly_means <- target_cos_month %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    print(monthly_means, n = 50)
    
  # Check the real usages
    # Rank the democracy contexts by similarity to the target embedding twice:
    # once returning the context tokens, once (contexts = NULL) returning an
    # identifier for each context -- presumably its document name, since a
    # "text" prefix is stripped from it below.
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Attach the identifier (as `text_num`) to each ranked context.
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    # Kept as a one-column data frame, as before, in case later code relies on it.
    words_candidates <- as.data.frame(words_candidates)

    # For every candidate term, the row indices of the contexts that mention it
    # (case-insensitive regex match). Built in one pass instead of growing a
    # data frame with rbind() inside a loop.
    term_hits <- lapply(words_candidates$words_candidates, function(term) {
      hit_rows <- grep(pattern = term, x = list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(term, length(hit_rows)), example = hit_rows)
    })
    result <- do.call(rbind, c(list(data.frame(words = character(0), example = integer(0))),
                               term_hits))

    # One row per (term, matching context) pair.
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the full article text for every context. The column is named
    # `text_num`; the previous `list_ncs$text` only worked through `$` partial
    # matching. Vectorised, so a table with zero rows is handled as well.
    # NOTE(review): assumes the number left after stripping "text" is the row
    # position of the document in demo_dem@docvars -- confirm.
    doc_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                 x = list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[doc_index]

    # Keep the three most similar contexts per term and store them for reading.
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_egalitarian2_democracy.rds")
    
    # correlation plot
    # Reshape the monthly similarities to wide form: one row per month, one
    # column per candidate term.
    corr_wide <- target_cos_month %>%
      select(target, feature, value) %>%
      pivot_wider(names_from = feature, values_from = value)
    # Use a plain data frame for the month labels: setting row names on a
    # tibble is deprecated and raises a warning.
    corr <- as.data.frame(corr_wide[, -1])
    row.names(corr) <- as.character(corr_wide$target)
    M <- cor(corr)
    
    # matrix of the p-value of the correlation
    # corrplot::cor.mtest() returns a list (p, lowCI, uppCI), whereas corrplot()
    # needs the p-value matrix itself. A matrix result (e.g. from a custom
    # cor.mtest defined elsewhere in this file) is passed through unchanged.
    p_test <- cor.mtest(corr)
    p.mat <- if (is.list(p_test)) p_test$p else p_test
    
    col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
    corrplot(M, method="color", col=col(200),  
             type="upper", order="hclust", 
             addCoef.col = "black", # Add coefficient of correlation
             tl.col="black", tl.srt=45, #Text label color and rotation
             # Combine with significance
             p.mat = p.mat, sig.level = 0.01, insig = "blank", 
             # hide correlation coefficient on the principal diagonal
             diag=FALSE 
    )
    #save with 1600 width

    
##############################
#
# 6. Cosine similarity for Authoritarian (Elitist) Democracy
#
##############################
  
  # Candidate vocabulary: every feature that occurs in the democracy contexts.
  feats <- featnames(dfm(demo_toks))

  # Only one-length term can be used (ex. civil liberty cannot be used)
  # NOTE(review): "executive order" below is a two-word pattern, which seems to
  # conflict with the note above -- confirm that tokens_context() matches it.

  # Seed terms for the authoritarian (elitist) model
    target_word <- c("executive order", "unilateral", "meritocratic", "top-down", "charisma")

    # For each seed term: collect the 6-token windows around it (keyword
    # removed), treat them as a single group, and fetch the 5 nearest-neighbour
    # features with bootstrapped uncertainty. The seed is reset before every
    # call. Results are stored in a list named by seed term.
    list_nns <- lapply(setNames(target_word, target_word), function(keyword) {
      keyword_contexts <- tokens_context(x = toks_nostop_feats, pattern = keyword, window = 6L,
                                         valuetype = "glob", case_insensitive = TRUE,
                                         hard_cut = FALSE, rm_keyword = TRUE,
                                         verbose = TRUE)
      docvars(keyword_contexts)$global <- 1
      set.seed(2021L)
      get_nns(x = keyword_contexts,
              N = 5,
              groups = docvars(keyword_contexts, 'global'),
              candidates = feats,
              pre_trained = glove_vectors,
              transform = TRUE,
              transform_matrix = transform_vectors,
              bootstrap = TRUE,
              num_bootstraps = 100,
              confidence_level = 0.95,
              stem = FALSE,
              as_list = FALSE)
    })

    # All-NA placeholder row that fixes the column layout of the stacked table.
    nns_template <- data.frame(target = NA,
                               feature = NA,
                               rank = NA,
                               value = NA,
                               std.error = NA,
                               lower.ci = NA,
                               upper.ci = NA)

    # Label each table with its seed term, stack them, then drop the placeholder.
    tagged_nns <- lapply(target_word, function(keyword) {
      nns_tbl <- list_nns[[keyword]]
      nns_tbl$target <- keyword
      nns_tbl
    })
    result_df <- do.call(rbind, c(list(nns_template), tagged_nns))
    result_df <- result_df[-1, ]
    
    # Pool the neighbour terms found for all target words and print them so the
    # list can be read and pruned by hand.
    words_candidates <- unique(result_df[["feature"]])
    words_candidates
    # Neighbours removed from the dictionary after reading the printed list.
    dropped_terms <- c("prohibiting", "forbidding", "permitting", "banning",
                       "prohibit", "entail", "escalation", "stance",
                       "egalitarian", "individualistic", "decentralized")
    words_candidates <- words_candidates[!words_candidates %in% dropped_terms]
    # Make sure the seed terms themselves are part of the dictionary.
    words_candidates <- unique(c(words_candidates, target_word))
    
    # Connections to Democracy
    # Cosine similarity between the democracy contexts and every candidate term,
    # computed for one grouping docvar at a time. The seed is reset before each
    # call so the three bootstraps start from the same RNG state.
    # NOTE(review): an earlier comment said the terms would be stemmed, but
    # stem = FALSE is what is passed to get_cos_sim() -- confirm intent.
    cos_sim_by_group <- function(group_var) {
      set.seed(2021L)
      get_cos_sim(x = demo_toks,
                  groups = docvars(demo_toks, group_var),
                  features = words_candidates,
                  pre_trained = glove_vectors,
                  transform = TRUE,
                  transform_matrix = transform_vectors,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  confidence_level = 0.95,
                  stem = FALSE,
                  as_list = FALSE)
    }

    target_cos_global <- cos_sim_by_group("global")
    target_cos_year <- cos_sim_by_group("pub_year_factor")
    target_cos_month <- cos_sim_by_group("pub_month_factor")

    # Month labels are parsed into Date values.
    target_cos_month[["target"]] <- ymd(target_cos_month[["target"]])

    # Whole-corpus similarities, strongest first, and their average.
    print(arrange(target_cos_global, desc(value)), n = 50)
    mean(target_cos_global[["value"]])
    
    # Yearly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero. The figure is written to fig/.
    # NOTE(review): unlike the monthly table, `target` is not converted to a
    # date/numeric type here, so ggplot presumably treats it as discrete. Without
    # an explicit group every year is its own one-point group and geom_line()
    # draws nothing; `group = 1` joins the years within each panel.
    target_cos_year %>% 
      ggplot() +
      geom_line(aes(x = target, y = value, group = 1))+
      geom_hline(yintercept = 0, color = "red")+
      xlab("") + 
      ylab("") +
      scale_color_viridis(discrete = TRUE, option = "A")+
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature)+
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    # ggsave() without `plot` writes the most recently created ggplot.
    ggsave(filename = "fig/dictionary_elitist2_year.jpeg", width = 8, dpi = 1000)
    
    # Monthly cosine similarity of each candidate term, one panel per term, with
    # a red reference line at zero; saved to fig/ via the last-plot default.
    ggplot(data = target_cos_month) +
      geom_line(mapping = aes(x = target, y = value)) +
      geom_hline(yintercept = 0, color = "red") +
      labs(x = "", y = "") +
      scale_color_viridis(discrete = TRUE, option = "A") +
      scale_fill_viridis(discrete = TRUE) +
      facet_wrap(~feature) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    ggsave(filename = "fig/dictionary_elitist2_month.jpeg", width = 8, dpi = 1000)

    # Average similarity per term across years, strongest first.
    target_cos_year %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    # Same per-term averages across months, printed with up to 50 rows.
    monthly_means <- target_cos_month %>%
      group_by(feature) %>%
      summarise(mean = mean(value)) %>%
      arrange(desc(mean))
    print(monthly_means, n = 50)
    
  # Check the real usages
    # Rank the democracy contexts by similarity to the target embedding twice:
    # once returning the context tokens, once (contexts = NULL) returning an
    # identifier for each context -- presumably its document name, since a
    # "text" prefix is stripped from it below.
    # NOTE(review): N = 29440 is hard-coded; presumably the number of contexts
    # in demo_dem -- confirm.
    list_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = demo_toks, N = 29440, as_list = FALSE)
    text_ncs <- ncs(x = demo_wv, contexts_dem = demo_dem, contexts = NULL, N = 29440, as_list = FALSE)

    # Attach the identifier (as `text_num`) to each ranked context.
    list_ncs <- merge(list_ncs,
                      text_ncs %>% select(-value, -target) %>% rename(text_num = context),
                      by = c("rank"))

    # Kept as a one-column data frame, as before, in case later code relies on it.
    words_candidates <- as.data.frame(words_candidates)

    # For every candidate term, the row indices of the contexts that mention it
    # (case-insensitive regex match). Built in one pass instead of growing a
    # data frame with rbind() inside a loop.
    term_hits <- lapply(words_candidates$words_candidates, function(term) {
      hit_rows <- grep(pattern = term, x = list_ncs$context, ignore.case = TRUE)
      data.frame(words = rep(term, length(hit_rows)), example = hit_rows)
    })
    result <- do.call(rbind, c(list(data.frame(words = character(0), example = integer(0))),
                               term_hits))

    # One row per (term, matching context) pair.
    list_ncs <- list_ncs[result$example, ]
    list_ncs <- cbind(list_ncs, result) %>% select(-target, -rank)

    # Look up the full article text for every context. The column is named
    # `text_num`; the previous `list_ncs$text` only worked through `$` partial
    # matching. Vectorised, so a table with zero rows is handled as well.
    # NOTE(review): assumes the number left after stripping "text" is the row
    # position of the document in demo_dem@docvars -- confirm.
    doc_index <- as.numeric(gsub(pattern = "text", replacement = "",
                                 x = list_ncs$text_num, ignore.case = TRUE))
    list_ncs$real <- demo_dem@docvars$full_text[doc_index]

    # Keep the three most similar contexts per term and store them for reading.
    list_ncs_top3 <- list_ncs %>% group_by(words) %>% arrange(desc(value)) %>% slice_head(n = 3)
    saveRDS(list_ncs_top3, file = "validation_elitist2_democracy.rds")
    
    # correlation plot
    # Reshape the monthly similarities to wide form: one row per month, one
    # column per candidate term.
    corr_wide <- target_cos_month %>%
      select(target, feature, value) %>%
      pivot_wider(names_from = feature, values_from = value)
    # Use a plain data frame for the month labels: setting row names on a
    # tibble is deprecated and raises a warning.
    corr <- as.data.frame(corr_wide[, -1])
    row.names(corr) <- as.character(corr_wide$target)
    M <- cor(corr)
    
    # matrix of the p-value of the correlation
    # corrplot::cor.mtest() returns a list (p, lowCI, uppCI), whereas corrplot()
    # needs the p-value matrix itself. A matrix result (e.g. from a custom
    # cor.mtest defined elsewhere in this file) is passed through unchanged.
    p_test <- cor.mtest(corr)
    p.mat <- if (is.list(p_test)) p_test$p else p_test
    
    col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
    corrplot(M, method="color", col=col(200),  
             type="upper", order="hclust", 
             addCoef.col = "black", # Add coefficient of correlation
             tl.col="black", tl.srt=45, #Text label color and rotation
             # Combine with significance
             p.mat = p.mat, sig.level = 0.01, insig = "blank", 
             # hide correlation coefficient on the principal diagonal
             diag=FALSE 
    )
    #save with 1600 width
    